NOTE: This document was generated with sctransform version 0.3.2.9003

Introduction

With this vignette we explore whether sctransform can be used with Smart-Seq2 data and what changes to the default parameters might be needed.

As an example data set we use the Smart-Seq2 data from the Tabula muris project.

Load and subset data

We have already loaded the data as a list facs with components counts and meta_data. Here we have already subsetted the data to contain only cells from bone marrow. There are a total of 5355 cells.

Explore data

Distribution of total counts per cell

# we are going to ignore the spike-in controls
is_ercc <- grepl(pattern = "^ERCC-", x = rownames(facs$counts))
facs$meta_data$counts <- sparseMatrixStats::colSums2(x = facs$counts, rows = !is_ercc)

p1 <- ggplot(facs$meta_data, aes(plate, log10(counts))) + geom_boxplot(aes(fill = mouse.id)) + 
    coord_flip()
show(p1)

Distribution of genes detected per cell

facs$meta_data$genes <- sparseMatrixStats::colSums2(x = facs$counts > 0, rows = !is_ercc)

p1 <- ggplot(facs$meta_data, aes(plate, log10(genes))) + geom_boxplot(aes(fill = mouse.id)) + 
    coord_flip()
show(p1)

Plot some more cell attributes: Distribution of spike in controls (ERCC), ribosomal genes, mitochondrial read fraction

facs$meta_data$pct_ercc <- sparseMatrixStats::colSums2(x = facs$counts, rows = is_ercc)/sparseMatrixStats::colSums2(x = facs$counts) * 
    100

is_ribo <- grepl("^Mrp[sl]", rownames(facs$counts))
facs$meta_data$pct_ribo <- sparseMatrixStats::colSums2(x = facs$counts, rows = is_ribo)/facs$meta_data$counts * 
    100

mitocarta <- readxl::read_excel(path = file.path(base_dir, "data", "Mouse.MitoCarta3.0.xls"), 
    sheet = 2)
mito_genes <- mitocarta$Symbol[1:400]
is_mito <- rownames(facs$counts) %in% mito_genes
facs$meta_data$pct_mito <- sparseMatrixStats::colSums2(x = facs$counts, rows = is_mito)/facs$meta_data$counts * 
    100

entropy <- function(p) {
    -sum(p * log(p), na.rm = TRUE)
}
facs$meta_data$entropy <- apply(t(facs$counts[!is_ercc, ])/facs$meta_data$counts, 
    1, entropy)

p1 <- ggplot(facs$meta_data, aes(plate, pct_ercc)) + geom_boxplot(aes(fill = mouse.id)) + 
    coord_flip()
p2 <- ggplot(facs$meta_data, aes(plate, pct_ribo)) + geom_boxplot(aes(fill = mouse.id)) + 
    coord_flip()
p3 <- ggplot(facs$meta_data, aes(plate, pct_mito)) + geom_boxplot(aes(fill = mouse.id)) + 
    coord_flip()
p4 <- ggplot(facs$meta_data, aes(plate, exp(entropy))) + geom_boxplot(aes(fill = mouse.id)) + 
    coord_flip()
show(((p1 + p2)/(p3 + p4)) + plot_layout(guides = "collect"))

Number of cells per mouse

group_by(facs$meta_data, mouse.id) %>% summarise(n = n())
mouse.id n
3_10_M 1266
3_38_F 1081
3_39_F 808
3_8_M 1177
3_9_M 1023

Normalizing the data using sctransform::vst

vst_out <- vst(umi = facs$counts[!is_ercc, ], cell_attr = facs$meta_data, method = "qpoisson", 
    return_cell_attr = TRUE, verbosity = 1)
#> Calculating cell attributes from input UMI matrix: log_umi
#> Variance stabilizing transformation of count matrix of size 17479 by 5355
#> Model formula is y ~ log_umi
#> Get Negative Binomial regression parameters per gene
#> Using 2000 genes, 5355 cells
#> There are 6 estimated thetas smaller than 1e-07 - will be set to 1e-07
#> Found 12 outliers - those will be ignored in fitting/regularization step
#> Second step: Get residuals using fitted parameters for 17479 genes
#> Calculating gene attributes
#> Wall clock passed: Time difference of 18.61333 secs

Distribution of cell counts and gene mean

p1 <- ggplot(vst_out$cell_attr, aes(log_umi)) + geom_histogram(binwidth = 0.05) + 
    xlab("Total counts per cell [log10]") + ylab("Number of cells")
p2 <- ggplot(vst_out$gene_attr, aes(log10(gmean))) + geom_histogram(binwidth = 0.05) + 
    xlab("Geometric mean of gene [log10]") + ylab("Number of genes")
show(p1 | p2)

Look at model parameters

p1 <- plot_model_pars(vst_out, show_theta = TRUE, show_var = TRUE, verbosity = 0)
show(p1)

It looks like there is a relationship between the mean of a gene and the model parameters and it is similar in style to what we see with UMI data. However, we see a much higher variance for all genes compared to UMI data where we usually see var = mean for genes with mean < 10.

Mean-variance plot

ga <- tibble::rownames_to_column(vst_out$gene_attr, var = "gene") %>% mutate(highlight = rank(-residual_variance) <= 
    25)

p1 <- ggplot(ga, aes(log10(gmean), sqrt(residual_variance), label = gene)) + geom_point() + 
    geom_smooth() + geom_text_repel(data = filter(ga, highlight), size = 3, color = "red") + 
    geom_point(data = filter(ga, highlight), size = 0.66, color = "red") + ggtitle("Residual variance as function of gene mean")
show(p1)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Look at the residual mean and variance for all genes

p1 <- ggplot(vst_out$gene_attr, aes(residual_mean)) + geom_histogram(binwidth = 0.01) + 
    xlim(-3, 3)
p2 <- ggplot(vst_out$gene_attr, aes(residual_variance)) + geom_histogram(binwidth = 0.1) + 
    geom_vline(xintercept = 1, color = "red") + xlim(0, 10)
show(p1 + p2)

The residual variance is quite covers a wide range is not as peaked around 1 as we are used to from UMI data. Perhaps this is not surprising since we do not have UMIs to remove technical noise introduced during PCR amplification.

As a result we might want to change the way we clip very high residuals to make sure signal from biologically variable genes is not dampened.

Look at gene models

Look at the models of the top variable genes

goi <- filter(ga, rank(-residual_variance) < 5) %>% pull(gene)
plot_model(x = vst_out, umi = facs$counts, goi = goi, plot_residual = TRUE)

In comparison, show non-variable genes with similar mean

goi2 <- sapply(goi, function(g) {
    g_gmean <- filter(ga, gene == g) %>% pull(gmean)
    (filter(ga, residual_variance < 1.25, residual_variance > 0.75) %>% arrange(abs(gmean - 
        g_gmean)) %>% pull(gene))[1]
})
plot_model(x = vst_out, umi = facs$counts, goi = goi2, plot_residual = TRUE)

Seurat with SCTransform

Basic workflow

s <- CreateSeuratObject(counts = facs$counts[!is_ercc, ], meta.data = facs$meta_data)
# run sctransform with less additional clipping of the residuals and all cells
s <- SCTransform(s, verbose = FALSE, method = "qpoisson", clip.range = sqrt(ncol(s))/5 * 
    c(-1, 1), ncells = Inf)
# 'standard' workflow
s <- RunPCA(s, verbose = FALSE)
s <- RunUMAP(s, dims = 1:30, verbose = FALSE)
s <- FindNeighbors(s, dims = 1:30, verbose = FALSE)
s <- FindClusters(s, verbose = FALSE)
p1 <- DimPlot(s, label = TRUE) + NoLegend() + ggtitle("Unsupervised clustering") + 
    theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank())
p2 <- DimPlot(s, label = TRUE, group.by = "subtissue") + ggtitle("Meta data subtissue label") + 
    theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank())
show(p1 + p2)

Cluster markers

Find the top markers of the unsupervised clusters

de_res <- diff_mean_test(y = GetAssayData(s, assay = "SCT", slot = "counts"), group_labels = s$seurat_clusters, 
    R = 49, mean_th = 10, only_pos = TRUE, only_top_n = 222, verbosity = 0)
top_markers <- group_by(de_res, group1) %>% filter(rank(-zscore, ties.method = "first") <= 
    4 | rank(-log2FC, ties.method = "first") <= 4) %>% select(group1, gene, mean1, 
    mean2, log2FC, zscore, emp_pval_adj)

p1 <- ggplot(de_res, aes(pmin(log2FC, 10), pmin(log10(zscore), 4))) + geom_point(aes(color = emp_pval_adj < 
    0.05)) + geom_point(data = top_markers, color = "deeppink") + geom_text_repel(data = top_markers, 
    mapping = aes(label = gene)) + facet_wrap(~group1, ncol = 5) + theme(legend.position = "bottom")
show(p1)

Table of top markers per cluster

DT::datatable(top_markers, rownames = FALSE) %>% DT::formatRound(3:7, digits = 2)

Seurat heatmap

marker_genes <- group_by(de_res, group1) %>% filter(rank(-log2FC, ties.method = "first") <= 
    4) %>% pull(gene)
# make sure all markers are in the scale.data slot (by default only the highly
# variable genes are there)
s <- GetResidual(s, features = marker_genes, verbose = FALSE)
DoHeatmap(s, features = marker_genes, slot = "scale.data")

Correlation of meta data with clusters

s$log_counts <- log10(s$nCount_RNA)
FeaturePlot(s, features = c("pct_ercc", "pct_mito", "pct_ribo", "log_counts"))

Contribution of mice to clusters

p1 <- group_by(s@meta.data, seurat_clusters, mouse.id) %>% summarise(cells = n()) %>% 
    ggplot(aes(seurat_clusters, cells, fill = mouse.id)) + geom_bar(stat = "identity")
p2 <- group_by(s@meta.data, seurat_clusters) %>% mutate(n = n()) %>% group_by(seurat_clusters, 
    mouse.id) %>% summarise(frequency = n()/n[1]) %>% ggplot(aes(seurat_clusters, 
    frequency, fill = mouse.id)) + geom_bar(stat = "identity")
show((p1 + p2) + plot_layout(guides = "collect"))

Seurat with log-normalization

Basic workflow

s <- CreateSeuratObject(counts = facs$counts[!is_ercc, ], meta.data = facs$meta_data)
s <- NormalizeData(s, normalization.method = "LogNormalize", verbose = FALSE)
s <- FindVariableFeatures(s, verbose = FALSE)
s <- ScaleData(s, verbose = FALSE)
s <- RunPCA(s, verbose = FALSE)
s <- RunUMAP(s, dims = 1:30, verbose = FALSE)
s <- FindNeighbors(s, dims = 1:30, verbose = FALSE)
s <- FindClusters(s, verbose = FALSE)
p1 <- DimPlot(s, label = TRUE) + NoLegend() + ggtitle("Unsupervised clustering") + 
    theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank())
p2 <- DimPlot(s, label = TRUE, group.by = "subtissue") + ggtitle("Meta data subtissue label") + 
    theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank())
show(p1 + p2)

Cluster markers

Find the top markers of the unsupervised clusters

s_markers <- FindAllMarkers(s, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 1, 
    verbose = FALSE)

top_markers <- group_by(s_markers, cluster) %>% filter(rank(-avg_log2FC, ties.method = "first") <= 
    4) %>% select(cluster, gene, avg_log2FC, pct.1, pct.2, p_val_adj)

Table of top markers per cluster

DT::datatable(top_markers, rownames = FALSE) %>% DT::formatRound(3:6, digits = 2)

Seurat heatmap

DoHeatmap(s, features = top_markers$gene) + NoLegend()

Correlation of meta data with clusters

s$log_counts <- log10(s$nCount_RNA)
FeaturePlot(s, features = c("pct_ercc", "pct_mito", "pct_ribo", "log_counts"))

Contribution of mice to clusters

p1 <- group_by(s@meta.data, seurat_clusters, mouse.id) %>% summarise(cells = n()) %>% 
    ggplot(aes(seurat_clusters, cells, fill = mouse.id)) + geom_bar(stat = "identity")
p2 <- group_by(s@meta.data, seurat_clusters) %>% mutate(n = n()) %>% group_by(seurat_clusters, 
    mouse.id) %>% summarise(frequency = n()/n[1]) %>% ggplot(aes(seurat_clusters, 
    frequency, fill = mouse.id)) + geom_bar(stat = "identity")
show((p1 + p2) + plot_layout(guides = "collect"))

Discussion

The count distribution for Smart-Seq2 data looks quite different from UMI-based data. Even low counts get are amplified such that there is a clear distinction between drop-outs (gene not detected) and low expression. In contrast, for UMI data this is a continuous gradient. The regression with a Negative Binomial model works, but is likely not the appropriate model. It is not cleat what benefits the default sctrasnsform workflow provides compared to the ‘standard’ log-normalization workflow currently used by Seurat.

Session info and runtime

Session info

sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Catalina 10.15.7
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] patchwork_1.1.0.9000   ggrepel_0.8.2          dplyr_1.0.2           
#> [4] knitr_1.30             Seurat_3.9.9.9008      sctransform_0.3.2.9003
#> [7] reshape2_1.4.4         ggplot2_3.3.2          Matrix_1.2-18         
#> 
#> loaded via a namespace (and not attached):
#>   [1] nlme_3.1-149            matrixStats_0.57.0      RcppAnnoy_0.0.16       
#>   [4] RColorBrewer_1.1-2      httr_1.4.2              tools_4.0.2            
#>   [7] DT_0.16                 R6_2.5.0                irlba_2.3.3            
#>  [10] rpart_4.1-15            KernSmooth_2.23-17      uwot_0.1.8.9001        
#>  [13] mgcv_1.8-33             lazyeval_0.2.2          colorspace_2.0-0       
#>  [16] withr_2.3.0             tidyselect_1.1.0        gridExtra_2.3          
#>  [19] compiler_4.0.2          formatR_1.7             plotly_4.9.2.1         
#>  [22] labeling_0.4.2          scales_1.1.1            lmtest_0.9-38          
#>  [25] spatstat.data_1.4-3     ggridges_0.5.2          pbapply_1.4-3          
#>  [28] spatstat_1.64-1         goftest_1.2-2           stringr_1.4.0          
#>  [31] digest_0.6.27           spatstat.utils_1.17-0   rmarkdown_2.5          
#>  [34] pkgconfig_2.0.3         htmltools_0.5.0         MatrixGenerics_1.3.0   
#>  [37] sparseMatrixStats_1.3.1 limma_3.44.3            highr_0.8              
#>  [40] fastmap_1.0.1           readxl_1.3.1            htmlwidgets_1.5.2      
#>  [43] rlang_0.4.9             shiny_1.5.0             farver_2.0.3           
#>  [46] generics_0.0.2          zoo_1.8-8               jsonlite_1.7.2         
#>  [49] crosstalk_1.1.0.1       ica_1.0-2               magrittr_2.0.1         
#>  [52] Rcpp_1.0.5              munsell_0.5.0           abind_1.4-5            
#>  [55] reticulate_1.16         lifecycle_0.2.0         stringi_1.5.3          
#>  [58] yaml_2.2.1              MASS_7.3-53             Rtsne_0.15             
#>  [61] plyr_1.8.6              grid_4.0.2              parallel_4.0.2         
#>  [64] listenv_0.8.0           promises_1.1.1          crayon_1.3.4.9000      
#>  [67] deldir_0.1-29           miniUI_0.1.1.1          lattice_0.20-41        
#>  [70] cowplot_1.1.0           splines_4.0.2           tensor_1.5             
#>  [73] pillar_1.4.7            igraph_1.2.6            future.apply_1.6.0     
#>  [76] codetools_0.2-16        leiden_0.3.3            glue_1.4.2             
#>  [79] evaluate_0.14           data.table_1.13.2       vctrs_0.3.5            
#>  [82] png_0.1-7               httpuv_1.5.4            cellranger_1.1.0       
#>  [85] gtable_0.3.0            RANN_2.6.1              purrr_0.3.4            
#>  [88] polyclip_1.10-0         tidyr_1.1.2             future_1.19.1          
#>  [91] xfun_0.19               rsvd_1.0.3              mime_0.9               
#>  [94] xtable_1.8-4            RSpectra_0.16-0         later_1.1.0.1          
#>  [97] survival_3.2-3          viridisLite_0.3.0       tibble_3.0.4           
#> [100] cluster_2.1.0           globals_0.13.1          fitdistrplus_1.1-1     
#> [103] ellipsis_0.3.1          ROCR_1.0-11

Runtime

print(proc.time() - tic)
#>    user  system elapsed 
#> 438.618  37.104 482.262